# Load the serialized corpus object from the working directory.
# NOTE(review): readRDS() expects a file written by saveRDS(); the ".rda"
# extension is more commonly used for save()/load() files -- confirm how
# this file was produced.
newBible <- readRDS(file = "newBible.rda")

library(tidytext)
## Warning: package 'tidytext' was built under R version 3.3.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.2
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'tibble' was built under R version 3.3.2
## Warning: package 'tidyr' was built under R version 3.3.2
## Warning: package 'readr' was built under R version 3.3.2
## Warning: package 'purrr' was built under R version 3.3.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.3.2
# Short codes, presumably identifying the Bible translations in the corpus
# (TODO confirm). Not referenced anywhere in the visible part of this script.
versions <- c(
  "kjb", "asv", "drb", "erv", "wbt", "web", "ylt", "akjv", "wnt"
)

# unnest_tokens() needs a character column, so coerce `text` first.
newBible$text <- as.character(newBible$text)

# One row per token: `text` is split into a new `word` column.
unnested <- unnest_tokens(newBible, word, text)

# Drop every token that appears in tidytext's stop-word lexicon.
data(stop_words)
stopUnnest <- anti_join(unnested, stop_words)
## Joining, by = "word"
# Bar chart of the most frequent non-stop-words in stopUnnest.
# Only words that occur more than 10,000 times are kept, and the bars are
# ordered by count.
stopUnnest %>%
  count(word, sort = TRUE) %>%
  filter(n > 10000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  # coord_flip() draws the words on the vertical axis and the counts on the
  # horizontal one; labels are still assigned through the x/y aesthetics.
  coord_flip() +
  theme_solarized(light = TRUE) +
  # x is the word and y is its count. The previous labels called the word
  # axis "Frequency" and also set xlab() twice.
  labs(title = "Unique Word Frequency", x = "Word", y = "Count") +
  theme(
    axis.text.y = element_text(angle = 40, hjust = 1),
    text = element_text(size = 10)
  )
## Warning: package 'bindrcpp' was built under R version 3.3.2

# stopUnnest %>%
#   count(word, sort = TRUE) %>%
#   filter(n > 600) %>%
#   mutate(word = reorder(word, n)) %>%
#   ggplot(aes(word, n)) +
#   geom_col() +
#   xlab(NULL) +
#   coord_flip()
# Net sentiment per book and chapter using the Bing lexicon.
# Words are matched against the lexicon, counted per book/chapter/sentiment,
# spread into `positive` and `negative` columns (0 where a class is absent),
# and reduced to `sentiment = positive - negative`. `direction` labels the
# sign of that score; a score of exactly 0 counts as "Positive".
bibleSentiment <- stopUnnest %>%
  inner_join(get_sentiments("bing")) %>%
  count(bookName, index = chapter, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(
    sentiment = positive - negative,
    direction = ifelse(sentiment >= 0, "Positive", "Negative")
  )
## Joining, by = "word"

# Net sentiment per book, with books ordered by their mean chapter score.
# bibleSentiment holds one row per book/chapter, so geom_col() stacks the
# chapter scores within each book's bar.
ggplot(
  bibleSentiment,
  aes(reorder(bookName, sentiment, FUN = mean), sentiment)
) +
  geom_col() +
  coord_flip() +
  theme_solarized(light = TRUE) +
  # The title and axis labels used to be copies of the word-frequency chart
  # ("Unique Word Frequency" / "Frequency" / "Count"), which does not
  # describe this plot.
  labs(
    title = "Net Sentiment by Book",
    x = "Book",
    y = "Net sentiment (positive - negative words)"
  ) +
  theme(
    axis.text.y = element_text(angle = 40, hjust = 1),
    text = element_text(size = 7)
  )

# Chapter-by-chapter net sentiment for a hand-picked set of books, one facet
# per book, with bars filled by the sign of the score.
#
# The bars used to be drawn twice: geom_col() and then
# geom_bar(stat = "identity") on top of it. The second layer hid the first,
# including its direction-coloured outlines and its show.legend = FALSE, and
# the duplicate layer produced duplicate traces in the plotly widget. A
# single layer with the second layer's settings renders the same figure.
gg <- ggplot(
  bibleSentiment %>%
    filter(bookName %in% c(
      "Matthew",
      "Mark",
      "Psalm",
      "Genesis",
      "Exodus",
      "Joshua",
      "Revelation"
    )),
  aes(index, sentiment, fill = direction)
) +
  # geom_text(aes(label=direction), vjust=0)
  geom_col(
    position = "identity",
    colour = "black",
    size = 0.5
  ) +
  facet_wrap(~bookName, ncol = 2, scales = "free_x")

# Render the static ggplot as an interactive plotly widget.
plotly::ggplotly(gg)
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of the (at most) 100 most frequent non-stop-words.
# local() keeps the intermediate count table out of the global environment.
local({
  word_totals <- count(stopUnnest, word)
  wordcloud(word_totals$word, word_totals$n, max.words = 100)
})

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Comparison cloud of Bing-lexicon words in the book of Matthew.
# local() keeps the intermediate objects out of the global environment.
local({
  # Per-word counts for Matthew, split by the lexicon's sentiment label.
  matthew_counts <- stopUnnest %>%
    inner_join(get_sentiments("bing")) %>%
    filter(bookName == "Matthew") %>%
    count(word, sentiment, sort = TRUE)

  # comparison.cloud() takes a term matrix: one row per word, one column per
  # sentiment label, with 0 where a word never carries that label.
  sentiment_matrix <- acast(
    matthew_counts,
    word ~ sentiment,
    value.var = "n",
    fill = 0
  )

  comparison.cloud(
    sentiment_matrix,
    colors = c("#F8766D", "#00BFC4"),
    max.words = 50
  )
})
## Joining, by = "word"

# Occurrences of each word within each book, most frequent first.
book_words <- ungroup(count(stopUnnest, bookName, word, sort = TRUE))

# Total number of (non-stop-word) tokens per book.
total_words <- summarize(group_by(book_words, bookName), total = sum(n))

# Attach each book's total to its word rows so that n / total can be
# computed downstream.
book_words <- book_words %>%
  left_join(total_words)
## Joining, by = "bookName"
# Rank and relative frequency of every word within its book.
# book_words is already sorted by descending n, so the row number inside
# each book group is the word's frequency rank. The result stays grouped by
# bookName.
freq_by_rank <- mutate(
  group_by(book_words, bookName),
  rank = row_number(),
  `term frequency` = n / total
)

freq_by_rank
# Distribution of per-word term frequency (n / total) for four books, one
# facet per book.
#
# The filter uses %in%, not ==. `bookName == c(...)` recycles the 4-element
# vector along the column, so each row is compared with only one of the four
# names and most matching rows are silently dropped.
ggplot(
  book_words %>%
    filter(bookName %in% c(
      "Genesis",
      "Matthew",
      "Exodus",
      "Leviticus"
    )),
  aes(n / total, fill = bookName)
) +
  # bins = 30 is the stat_bin() default, stated explicitly.
  geom_histogram(show.legend = FALSE, bins = 30) +
  # Cut off the long right tail; rows above the limit are dropped with a
  # warning.
  xlim(NA, 0.0009) +
  facet_wrap(~bookName, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 210 rows containing non-finite values (stat_bin).

# Rank vs. term frequency for every book on log-log axes.
# freq_by_rank was already built from book_words earlier in the script and
# book_words has not changed since, so it is reused here instead of being
# recomputed and printed a second time.
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = bookName)) +
  geom_line(size = 1.2, alpha = 0.8) +
  scale_x_log10() +
  scale_y_log10()

# Fit a power law to the middle of the rank/frequency curve and overlay the
# fitted line on the log-log plot.

# Keep only ranks strictly between 10 and 500 for the fit.
rank_subset <- freq_by_rank %>%
  filter(
    rank < 500,
    rank > 10
  )

# Linear fit in log10-log10 space. The model is stored so that the reference
# line below is drawn from the fitted coefficients, not from hand-typed
# numbers.
zipf_fit <- lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
zipf_fit
## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##      -1.016       -0.866

# With log10 scales on both axes, geom_abline() interprets intercept and
# slope in log10 units, so the fitted coefficients can be used directly.
# The previous hard-coded values (-0.62, -1.1) did not match this fit.
zipf_coef <- unname(coef(zipf_fit))

freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = bookName)) +
  geom_abline(
    intercept = zipf_coef[1],
    slope = zipf_coef[2],
    color = "gray50",
    linetype = 2
  ) +
  geom_line(size = 1.2, alpha = 0.8) +
  scale_x_log10() +
  scale_y_log10()